Goal: we want to approximate the dominant eigenfunctions of the transfer operator underlying some observed dynamics.
To do this, we need to choose a good basis set. The variational principle of molecular kinetics lets us choose the optimal linear combinations of a given set of basis functions. In MSMs, these basis functions are indicator functions on a partitioning of the configuration space. In tICA, these basis functions are arbitrary input variables, usually internal coordinates like dihedral angles.
Another approach to constructing a basis set is to measure RMSD or weighted RMSD to some reference structures. A major benefit of that approach is that the model should now be differentiable with respect to the parameters defining the basis functions-- the locations of the references, and their associated weights and scale parameters.
In [27]:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
%matplotlib inline
import mdtraj
plt.rc('font', family='serif')
In [28]:
# fetch example data
from msmbuilder.example_datasets import FsPeptide
dataset = FsPeptide().get()
fs_trajectories = dataset.trajectories
fs_t = fs_trajectories[0]
In [29]:
# 1. internal coordinate basis sets
from msmbuilder.featurizer import DihedralFeaturizer
basis_sets = dict()
dih_model=DihedralFeaturizer()
X = dih_model.fit_transform(fs_trajectories)
basis_sets['dihedral_phi_psi'] = X
dih_model=DihedralFeaturizer(types=['phi', 'psi', 'omega', 'chi1', 'chi2', 'chi3', 'chi4'])
X = dih_model.fit_transform(fs_trajectories)
basis_sets['dihedral_all'] = X
In [7]:
# 2. RMSD to reg-space refs
references = [x[::4000] for x in fs_trajectories]
refs = references[0]
for ref in references[1:]:
refs = refs + ref
# oh, this is already built into MSMBuilder
#def compute_rmsds_to_refs(trajectories,refs):
# basis_exps = []
# for traj in trajectories:
# rmsd_to_refs = np.zeros((len(traj),len(refs)))
# for i in range(len(refs)):
# rmsd_to_refs[:,i] = mdtraj.rmsd(traj,refs,i)
# basis_exps.append(rmsd_to_refs)
# return basis_exps
#basis_sets['rmsd_reg'] = compute_rmsds_to_refs(fs_trajectories,refs)
from msmbuilder.featurizer import RMSDFeaturizer
rmsdf = RMSDFeaturizer(refs)
basis_sets['rmsd_reg'] = rmsdf.fit_transform(fs_trajectories)
print(len(refs))
In [8]:
# 3. RMSD to cluster-center refs
# pick cluster centers by k-medoids
from msmbuilder.cluster import MiniBatchKMedoids
kmed = MiniBatchKMedoids(50,batch_size=200)
kmed.fit(X)
Out[8]:
In [9]:
# extract examplar configurations
clever_refs = []
for ind in kmed.cluster_ids_:
clever_refs.append(fs_trajectories[ind[0]][ind[1]])
# convert list of length-1 mdtraj Trajectories to a single trajectory
# -- currently doing this in the most inefficient way possible, but it's
# a tiny list so it doesn't matter
clever_ref = clever_refs[0]
for i in range(1,len(clever_refs)):
clever_ref = clever_ref + clever_refs[i]
clever_refs = clever_ref
print(len(clever_refs))
In [10]:
rmsdf = RMSDFeaturizer(clever_refs)
basis_sets['rmsd_kmed'] = rmsdf.fit_transform(fs_trajectories)
In [11]:
rmsd_kmed_basis = np.array(basis_sets['rmsd_kmed'])
In [26]:
# 4. wRMSD to reg-space refs
from MDAnalysis.analysis.rms import rmsd as wRMSD
# compute weights from atomwise deviations
atomwise_deviations = np.load('fs_atomwise_deviations_tau=20.npy')
mean = atomwise_deviations.mean(0)
weights = np.exp(-mean/0.065)
In [84]:
def compute_wrmsd_traj(trajectories,refs,weights):
''' compute wRMSDs from trajectories to a list of references'''
basis_exps = []
for traj in trajectories:
wrmsd_to_refs = np.zeros((len(traj),len(refs)))
for i in range(len(traj)):
for j in range(len(refs)):
wrmsd_to_refs[i,j] = wRMSD(refs.xyz[j],traj.xyz[i],weights=weights,center='True')
basis_exps.append(wrmsd_to_refs)
return basis_exps
weighted_trajs = compute_wrmsd_traj(fs_trajectories,refs,weights)
basis_sets['wrmsd_exp'] = weighted_trajs
In [77]:
#np.save('wrmsd_exp.npy',weighted_trajs)
#np.save('wrmsd_inv.npy',inv_mean_weighted_trajs)
#np.save('wrmsd_mean.npy',mean_weighted_trajs)
In [30]:
# loading from file
weighted_trajs = np.load('wrmsd_exp.npy')
weighted_trajs.shape # 28 trajectories, 10k frames / traj, 84 basis fxns
Out[30]:
In [31]:
# grid search over alphas, assuming the same alpha for all
alphas = np.logspace(-10,10,base=np.e)
for i,alpha in enumerate(alphas):
basis_sets['wrmsd_gauss_{0}'.format(i)] = np.exp(-(weighted_trajs)**2 / (alpha*np.ones(84)))
In [43]:
rmsd_basis = np.array(basis_sets['rmsd_reg'])
In [44]:
for i,alpha in enumerate(alphas):
basis_sets['rmsd_gauss_{0}'.format(i)] = np.exp(-(rmsd_basis)**2 / (alpha*np.ones(84)))
In [14]:
for i,alpha in enumerate(alphas):
basis_sets['rmsd_kmed_gauss_{0}'.format(i)] = np.exp(-(rmsd_kmed_basis)**2 / (alpha*np.ones(50)))
In [24]:
basis_sets['rmsd_kmed_gauss_18']
Out[24]:
In [37]:
alphas[18]
Out[37]:
In [66]:
# 5. indicator functions
clusters = kmed.transform(X)
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(n_values=np.max(np.vstack(clusters))+1,
sparse=False)
cluster_enc = enc.fit_transform(clusters[0].reshape((len(clusters[0]),1)))
In [65]:
cluster_enc
In [163]:
from msmbuilder.msm import MarkovStateModel
msm = MarkovStateModel()
msm.fit(clusters)
msm.score_
Out[163]:
In [165]:
plt.imshow(msm.transmat_,interpolation='none')
Out[165]:
In [15]:
from msmbuilder.decomposition import tICA
def score_via_tica(basis_set,m=10,lag_time=100):
''' Since the optimal matrix of expansion coefficients A is given by
concatenating the generalized eigenvectors of the overlap and time-lagged overlap
matrices, we can directly measure the quality of a candidate basis set for rank-m
transfer operator approximation by performing tICA, then summing the top m eigenvalues.
'''
tica = tICA(lag_time=lag_time)
tica.fit(basis_set)
return np.sum(tica.eigenvalues_[:m])
def score_via_gmrq(basis_set,expansion_coeffs=None,m=10,lag_time=100):
### WARNING: this is buggy-- this *should* give the same answer as tICA
# if you pass it the same basis_set and don't specify expansion_coeffs
# but the answer's off by ~10% on examples.
# I'm also not handling singular matrices correctly I think.
tau = lag_time
# compute the overlap and time-lagged overlap matrices
S = basis_set.T.dot(basis_set) / len(basis_set)
C = basis_set[tau:].T.dot(basis_set[:-tau]) / len(basis_set - tau)
# expansion coefficients
if expansion_coeffs == None:
import scipy.linalg as la
eig_vals,eig_vecs = la.eig(C,S)
# discard imaginary component
eig_vals = np.array(eig_vals,dtype=float)
eig_vecs = np.array(eig_vecs,dtype=float)
A = eig_vecs[:,:m]
#A = np.diag(np.ones(basis_set.shape[1]))
else:
A = expansion_coeffs
# compute the GMRQ from these overlap matrices
P = A.T.dot(C).dot(A)
Q = A.T.dot(S).dot(A)
try:
Q_inv = np.linalg.inv(Q)
except:
# it could be singular
Q_inv = np.linalg.pinv(Q)
P_Q_inv = P.dot(Q_inv)
diag = np.sort(np.diag(P_Q_inv))[::-1]
partial_trace = diag[:m]
return P_Q_inv,np.sum(partial_trace)
In [16]:
score = score_via_tica
In [156]:
# actually, I might need to make a small modification to GMRQ to accept
# direct eigenfunction approximations that
In [157]:
P_Q_inv,gmrq = score_via_gmrq(weighted_trajs[0])
P_Q_inv.shape,gmrq
Out[157]:
In [159]:
# these should be equal: indicates a problem
gmrq,score_via_tica([weighted_trajs[0]])
Out[159]:
In [49]:
X[0].shape
Out[49]:
In [ ]:
scores = dict()
In [ ]:
clusters = kmed.fit_transform(basis_sets['
In [45]:
m=20
lag_time=100
for name in basis_sets:
if name not in scores:
scores[name] = score(basis_sets[name],m=m,lag_time=lag_time)
name_ = name+'_renorm'
if name_ not in scores:
try:
renorm = [(traj.T / traj.T.sum(0)).T for traj in basis_sets[name]]
scores[name_] = score(renorm,m=m,lag_time=lag_time)
except:
pass
In [51]:
renorm = [(traj.T / traj.T.sum(0)).T for traj in basis_sets['wrmsd_gauss_16']]
In [55]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(np.vstack(renorm))
X = pca.transform(np.vstack(renorm))
In [56]:
plt.plot(pca.explained_variance_ratio_)
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
Out[56]:
In [58]:
plt.scatter(X[:,0],X[:,1],s=1,c=range(len(X)),linewidths=0)
Out[58]:
In [64]:
for i in range(5):
plt.bar(range(len(pca.components_[i])),pca.components_[i])
plt.title('Component {0}'.format(i+1))
plt.figure()
In [49]:
sorted(scores.items(),key=lambda item:-item[1])
Out[49]:
In [46]:
sorted(scores.items(),key=lambda item:-item[1])
Out[46]:
In [89]:
# 'archive'
scores.items()
Out[89]:
In [ ]:
Other things to try: